In [1]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
from sqlalchemy import create_engine
import os
import pymysql

In this Notebook, we are going to use the visualziation tools to show your some insight information about the Philddelphia House Data. Part I data are based on the orginal datasets. Part II data are based on the summary table.

Part I

In [5]:
phi_house_data_vis = pd.read_csv('../data/phi_opa_properties_public.csv')
# original file (large file >300MB) can be download from 
# https://www.opendataphilly.org/dataset/opa-property-assessments/resource/3084509c-82ad-4718-8ab2-474196eff98b
/Users/nli/Library/Python/3.8/lib/python/site-packages/IPython/core/interactiveshell.py:3062: DtypeWarning:

Columns (2,5,12,20,31,43,54,61,68,69,72,74) have mixed types.Specify dtype option on import or set low_memory=False.

In [6]:
# This data visualzation is based the original uncleaned data from Philly government
# You will have some senses what kinds of housing related descirption words has the most frequency in the Phiadelphia
phi_house_data_vis.head()
Out[6]:
the_geom lng assessment_date basements beginning_point book_and_page building_code building_code_description category_code category_code_description ... unit utility view_type year_built year_built_estimate zip_code zoning objectid lat number_of_bedrooms
0 0101000020E6100000D4E21F8673C952C0CBA8E9A316F7... -75.147676 NaN D 16'7" W HANCOCK 65522 O30 ROW 2 STY MASONRY 1 Single Family ... NaN NaN I 1920 NaN 191476127 RSA5 675202590 39.930378 3.0
1 0101000020E6100000A9112A846AC952C03BE14A5A0CF7... -75.147126 NaN D S W COR HOWARD ST 466484 O50 ROW 3 STY MASONRY 1 Single Family ... NaN NaN I 1920 Y 191476126 RSA5 675202591 39.930064 2.0
2 0101000020E6100000D01019376BC952C07F5658870CF7... -75.147169 NaN D 12'6" W HOWARD ST 709919 O50 ROW 3 STY MASONRY 1 Single Family ... NaN NaN I 1920 Y 191476126 RSA5 675202592 39.930070 2.0
3 0101000020E6100000BA5DAA196CC952C07BF4F20D0DF7... -75.147223 NaN A 24.250' NW OF HOWARD ST 3.18641e+06 P70 ROW W/GAR 4 STY MASONRY 1 Single Family ... NaN A I 2019 NaN 191476112 RM1 675202593 39.930086 4.0
4 0101000020E6100000BEB025446DC952C014FECA560DF7... -75.147294 NaN NaN 44.194' SW OF HOWARD ST 3.19248e+06 SR VACANT LAND RESIDE < ACRE 6 Vacant Land ... NaN NaN NaN 0 NaN 191476112 ICMX 675202594 39.930095 0.0

5 rows × 79 columns

In [7]:
phi_house_data_vis.columns
Out[7]:
Index(['the_geom', 'lng', 'assessment_date', 'basements', 'beginning_point',
       'book_and_page', 'building_code', 'building_code_description',
       'category_code', 'category_code_description', 'census_tract',
       'central_air', 'cross_reference', 'date_exterior_condition', 'depth',
       'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
       'frontage', 'fuel', 'garage_spaces', 'garage_type',
       'general_construction', 'geographic_ward', 'homestead_exemption',
       'house_extension', 'house_number', 'interior_condition', 'location',
       'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
       'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
       'market_value_date', 'number_of_bathrooms', 'the_geom_webmercator',
       'number_of_rooms', 'number_stories', 'off_street_open',
       'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
       'quality_grade', 'recording_date', 'registry_number', 'sale_date',
       'sale_price', 'separate_utilities', 'sewer', 'site_type', 'state_code',
       'street_code', 'street_designation', 'street_direction', 'street_name',
       'suffix', 'taxable_building', 'taxable_land', 'topography',
       'total_area', 'total_livable_area', 'type_heater', 'unfinished', 'unit',
       'utility', 'view_type', 'year_built', 'year_built_estimate', 'zip_code',
       'zoning', 'objectid', 'lat', 'number_of_bedrooms'],
      dtype='object')

Word Cloud: See which keywords have the high frequency in the dataset

In [8]:
# Street Name:
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white', 
                    width = 1920, 
                    height = 1080
                    ).generate(" ".join(phi_house_data_vis.street_name))

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_street_name.jpg')
In [9]:
#category_code_description
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white', 
                    width = 1920, 
                    height = 1080
                    ).generate(" ".join(phi_house_data_vis.category_code_description))

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_category_code_description.jpg')
In [10]:
# owner name
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white', 
                    width = 1920, 
                    height = 1080
                    ).generate(" ".join(phi_house_data_vis.owner_1))

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_owner_name.jpg')
In [11]:
# building_code
plt.subplots(figsize = (25, 15))
wordcloud=WordCloud(background_color = 'white', 
                    width = 1920, 
                    height = 1080
                    ).generate(" ".join(phi_house_data_vis.building_code))

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
cloud = wordcloud.to_file('../jupyter_nb/Vis_Figures/WC_building_code.jpg')
In [ ]:
 

Heatmap: House Location and Price Ditribution

In [12]:
#House Price less than $ 200,000 (> 1000)
geo_phi_pp_vis = phi_house_data_vis[(
    phi_house_data_vis.market_value >= 1000) & (phi_house_data_vis.market_value <= 200000)].plot(
    kind='scatter', x='lng', y='lat', label='residental houses',c='market_value', cmap=plt.get_cmap('spring'), 
                colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_less_200k.jpg')
In [13]:
geo_phi_pp_vis = phi_house_data_vis[(
    phi_house_data_vis.market_value >= 200000) & (phi_house_data_vis.market_value <= 800000)].plot(
                kind='scatter', x='lng', y='lat', 
                label='residental houses',c='market_value', cmap=plt.get_cmap('summer'), 
                colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_200k_800k.jpg')
In [14]:
geo_phi_pp_vis = phi_house_data_vis[(
    phi_house_data_vis.market_value > 800000) & (phi_house_data_vis.market_value <= 2000000)].plot(
                kind='scatter', x='lng', y='lat', 
                label='residental houses',c='market_value', cmap=plt.get_cmap('autumn'), 
                colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_800k_2M.jpg')
In [15]:
geo_phi_pp_vis = phi_house_data_vis[phi_house_data_vis.market_value >= 2000000].plot(kind='scatter', x='lng', y='lat', 
                label='residental houses',c='market_value', cmap=plt.get_cmap('winter'), 
                colorbar=True, alpha=0.4, figsize=(14,8))
geo_phi_pp_vis.legend()
geo_phi_pp_vis.figure.savefig('../jupyter_nb/Vis_Figures/phi_house_market_value_great_2M.jpg')
In [ ]:
 

Part II

The follwing data visualization data are come from Spark SQL at Databricks, based on 8 datasets with joint tables

In [16]:
zip_code_gb_phi_df = pd.read_csv('../data/summary_data/zip_code_gb_phi_house_df_summary.csv')
In [17]:
zip_code_gb_phi_df.head()
Out[17]:
zip_code number_zip_code_count avg_market_value avg_number_bedrooms avg_number_stories avg_taxable_building avg_taxable_land avg_total_area avg_livable_area avg_year_built avg_sale_price hospitals_count parks_recreation_count police_dept_stations_count healthy_corner_stores_count health_centers_count schools_count fire_dept_stations_count ave_sale_count_2017_to_2020_market_sale avg_sales_price_2017_to_2020_market_sale
0 19132 6895 50549 2.8 2.0 30908 8447 1033 1386 1924 57026 NaN 15.0 NaN 47.0 2.0 14.0 2.0 39.0 36829.0
1 19151 8108 145389 1.8 1.9 96239 24107 2261 1581 1932 117789 NaN 7.0 1.0 13.0 NaN 14.0 1.0 27.0 144686.0
2 19121 6186 220812 3.0 2.1 81288 45149 1301 1932 1934 214792 NaN 19.0 1.0 34.0 4.0 22.0 3.0 29.0 163343.0
3 19138 7144 110850 2.9 1.9 72611 16469 1837 1329 1925 77899 NaN 5.0 NaN 14.0 NaN 13.0 NaN 31.0 122286.0
4 19147 11106 447367 1.8 2.1 249633 111524 846 1944 1933 357846 NaN 32.0 1.0 16.0 2.0 12.0 4.0 54.0 463378.0
In [18]:
# Let's look at the describe of the summary data
zip_code_gb_phi_df.describe().round(1)
Out[18]:
zip_code number_zip_code_count avg_market_value avg_number_bedrooms avg_number_stories avg_taxable_building avg_taxable_land avg_total_area avg_livable_area avg_year_built avg_sale_price hospitals_count parks_recreation_count police_dept_stations_count healthy_corner_stores_count health_centers_count schools_count fire_dept_stations_count ave_sale_count_2017_to_2020_market_sale avg_sales_price_2017_to_2020_market_sale
count 47.0 47.0 47.0 47.0 47.0 47.0 47.0 47.0 47.0 47.0 47.0 22.0 46.0 20.0 34.0 22.0 46.0 40.0 46.0 46.0
mean 19130.3 7627.1 255808.3 2.2 1.8 154632.0 50730.0 2363.4 1729.6 1937.6 254399.3 1.6 11.2 1.2 19.7 2.0 11.8 1.7 36.5 244301.3
std 14.9 4256.5 194059.3 0.7 0.9 129289.4 36642.5 2249.6 454.5 18.2 312149.9 0.8 6.8 0.4 16.6 0.9 6.6 0.8 21.0 189254.7
min 19102.0 2.0 50549.0 0.4 0.6 29378.0 8168.0 202.0 1020.0 1911.0 54247.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 9.0 36829.0
25% 19119.5 4596.5 125803.5 1.8 1.4 80073.5 22522.5 1084.0 1376.5 1925.0 102578.5 1.0 6.0 1.0 7.0 1.0 6.2 1.0 20.2 122836.0
50% 19131.0 7526.0 211963.0 2.4 1.9 114934.0 41004.0 1787.0 1618.0 1931.0 146616.0 1.0 10.5 1.0 15.0 2.0 11.0 1.5 31.0 198185.0
75% 19142.5 9290.5 292820.0 2.8 2.0 173955.5 66006.0 3020.5 1932.5 1947.0 251303.5 2.0 14.8 1.0 26.8 3.0 15.0 2.0 50.5 256217.5
max 19154.0 16653.0 899973.0 3.0 7.0 663141.0 150294.0 14414.0 2940.0 2009.0 1944578.0 4.0 32.0 2.0 68.0 4.0 32.0 4.0 84.0 847886.0
In [19]:
zip_code_gb_phi_df.shape
Out[19]:
(47, 20)
In [20]:
zip_code_gb_phi_df.isnull().sum()
Out[20]:
zip_code                                     0
number_zip_code_count                        0
avg_market_value                             0
avg_number_bedrooms                          0
avg_number_stories                           0
avg_taxable_building                         0
avg_taxable_land                             0
avg_total_area                               0
avg_livable_area                             0
avg_year_built                               0
avg_sale_price                               0
hospitals_count                             25
parks_recreation_count                       1
police_dept_stations_count                  27
healthy_corner_stores_count                 13
health_centers_count                        25
schools_count                                1
fire_dept_stations_count                     7
ave_sale_count_2017_to_2020_market_sale      1
avg_sales_price_2017_to_2020_market_sale     1
dtype: int64
In [21]:
zip_code_gb_phi_df = zip_code_gb_phi_df.fillna(0)
In [22]:
zip_code_gb_phi_df.isnull().sum()
Out[22]:
zip_code                                    0
number_zip_code_count                       0
avg_market_value                            0
avg_number_bedrooms                         0
avg_number_stories                          0
avg_taxable_building                        0
avg_taxable_land                            0
avg_total_area                              0
avg_livable_area                            0
avg_year_built                              0
avg_sale_price                              0
hospitals_count                             0
parks_recreation_count                      0
police_dept_stations_count                  0
healthy_corner_stores_count                 0
health_centers_count                        0
schools_count                               0
fire_dept_stations_count                    0
ave_sale_count_2017_to_2020_market_sale     0
avg_sales_price_2017_to_2020_market_sale    0
dtype: int64

Bar Plot

In [23]:
# Average Market Price Group By Zip Zode 
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_market_value', color = 'number_zip_code_count', 
             template = 'plotly_dark', title = 'Average Market Value - Zip Code - Number of Zip Code Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
In [46]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_Mmarket_By_Zip_Code.jpg')
# Save a copy of jpg file for Report
In [25]:
# Average Taxable Building Group By Zip Zode 
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_taxable_building', color = 'number_zip_code_count', 
             template = 'plotly_dark', title = 'Average Taxable Building - Zip Code - Number of Zip Code Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_taxable_building (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
In [47]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_Taxable_Building_By_Zip_Code.jpg')
In [27]:
# Average Sale Price (publica records) Group By Zip Zode 
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_sale_price', color = 'number_zip_code_count', 
             template = 'plotly_dark', 
             title = 'Average Sales Price (Public Properties) - Zip Code - Number of Zip Code Count')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_sales_price (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
In [48]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_sales_Price_By_Zip_Code.jpg')
In [29]:
# Average Market Sales Price 2017 - 2020 Group By Zip Zode 
fig = px.bar(zip_code_gb_phi_df, x = 'zip_code', y= 'avg_sales_price_2017_to_2020_market_sale', color = 'ave_sale_count_2017_to_2020_market_sale', 
             template = 'plotly_white', 
             title = 'Melissa Agent Average Market Sales Price - Zip Code - Melissa Agent Average Market Sales Count (color)')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_sales_price_2017_to_2020 mkt (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_xaxes(nticks=53)
fig.show()
In [49]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_sales_Price_Melissa_By_Zip_Code.jpg')

Scatter Plot

In [33]:
# 3D scatter plot (major houses were built before 1955)
fig = px.scatter(zip_code_gb_phi_df, x="avg_year_built", y="avg_market_value", color="zip_code",
                 size='number_zip_code_count', template = 'ggplot2', 
                 title = 'Average Market Value - Average Year Built - Zip Code (color) ')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()
In [50]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_By_year_built_scatter.jpg')
In [35]:
fig = px.scatter(zip_code_gb_phi_df, x="avg_year_built", y="avg_sale_price", color="zip_code",
                 size='number_zip_code_count', template = 'seaborn',
                 title = 'Average Sales Price (Public Properties) - Average Year Built - Zip Code (color) ')
fig.update_layout(xaxis_tickformat = ' ')
fig.update_layout(yaxis_title="avg_market_value (USD)")
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()
In [51]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_sale_price_pp_By_year_built_scatter.jpg')

More Dimensional Scatter Plot (x, y, z, size, color)

In [37]:
# 4D plot
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'zip_code', y = 'avg_taxable_building', z='avg_year_built',  
                    color = 'avg_market_value', template = 'plotly_dark', 
title = 'x: Zip Code - y: Average Taxable Building(USD) - z: Average Year Built - color: Average Market Value(USD)')
fig.show()
In [52]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_yb_avg_tb_By_Zipcode_scatter_4d.jpg')
In [39]:
# More 4d plot on different parameter
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'avg_number_bedrooms', y = 'avg_livable_area', z='avg_number_stories',  
                    color = 'avg_market_value', template = 'plotly_dark', hover_data = ['zip_code'],
title = 'x: Avg Number Bedrooms - y: Avg livable Area - z: Avg Number Stories - color: Avg Market Value')
fig.show()
In [56]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_stories_avg_la_avg_bds_scatter_4d.jpg')
In [54]:
fig = px.scatter_3d(zip_code_gb_phi_df, x = 'healthy_corner_stores_count', y = 'schools_count', z='parks_recreation_count',  
                    color = 'avg_market_value', template = 'plotly', hover_data = ['zip_code'],
title = 'Average Market Value - Parks Recreation Count - Schools Counts - Healthy Corner Store Count')
fig.show()
In [57]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_avg_pr_cnt_avg_schl_cnt_avg_hcs_scatter_4d.jpg')
In [55]:
# 5D plot Demo
fig = px.scatter_3d(zip_code_gb_phi_df, x='schools_count', y = 'health_centers_count', z = 'parks_recreation_count', 
                   size='avg_market_value', color='zip_code', template = 'plotly_dark',
                   title = 'x: Schools Cnt - y: Health Centers Cnt - z: Parks Recreation Cnt - size: Average Mkt Value - color: Zip Code')

fig.show()
In [58]:
fig.write_image('../jupyter_nb/Vis_Figures/Avg_market_value_schl_cnt_hcs_count_pr_cnt_avg_market_value_zipcode_scatter_5d.jpg')

Here, we used higher dimensional plot show make the Phidelphia Housing Data Visible, and help us to understand how different factors affect the house market value price. Based on the data visulization result, the house at zipcode 19102, 19103, 19106, 19107, and 19118 are the TOP 5 high market value area. Higher market value price also contribute high tax. Zip code between 19145 to 19149 have high amount of sale. This indicates more population move in these zipcode area.

The average mean market value of Philadephia house is only around USD 255,000 , which very consistant with the market sale price from Melissa Agent data at around USD 240,000 (2017 - 2020)

In [179]:
# After Data Visualization, the summary data will be saved into the mysql database.
conn = create_engine("mysql+pymysql://" + os.environ.get("mysql_user") + ":" + os.environ.get("mysql_key") + "@localhost:3306/data_group_project")
df = pd.read_csv('../data/summary_data/zip_code_gb_phi_house_df_summary.csv', delimiter=',')
df.to_sql(name='zip_code_gb_phi_house_df_summary', con=conn, schema='data_group_project', if_exists='replace')
In [ ]: